{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Lab 26 - k-Nearest Neighbors classifier 2\n", "\n", "We will continue working with the Titanic training and test data from [Kaggle](https://www.kaggle.com/c/titanic) that we used in Labs 24 and 25.\n", "\n", "First import the necessary libraries." ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.model_selection import train_test_split\n", "%matplotlib inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Loading and cleaning the data\n", "\n", "The code from Lab 25 for loading and cleaning the data is below." ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 3101282 7.9250 NaN S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NaN S " ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train = pd.read_csv(\"../Data/train.csv\")\n", "train.head()" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# fill in missing age data\n", "train[\"Age\"] = train[\"Age\"].fillna(train[\"Age\"].median())\n", "\n", "# fill in the missing embarked data\n", "train[\"Embarked\"] = train[\"Embarked\"].fillna(\"S\")" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
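{ "cell_type": "markdown", "metadata": {}, "source": [ "As a quick optional check that the cleaning worked, we can count the missing values left in each column; after filling `Age` and `Embarked`, only `Cabin` should still contain NaN. (Optional; the later steps do not depend on it.)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# optional sanity check: count the missing values remaining in each column of train\n", "train.isnull().sum()" ] },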
" ], "text/plain": [ " PassengerId Survived Name \\\n", "0 1 0 Braund, Mr. Owen Harris \n", "1 2 1 Cumings, Mrs. John Bradley (Florence Briggs Th... \n", "2 3 1 Heikkinen, Miss. Laina \n", "3 4 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) \n", "4 5 0 Allen, Mr. William Henry \n", "\n", " Age SibSp Parch Ticket Fare Cabin Pclass_2 Pclass_3 \\\n", "0 22.0 1 0 A/5 21171 7.2500 NaN 0 1 \n", "1 38.0 1 0 PC 17599 71.2833 C85 0 0 \n", "2 26.0 0 0 STON/O2. 3101282 7.9250 NaN 0 1 \n", "3 35.0 1 0 113803 53.1000 C123 0 0 \n", "4 35.0 0 0 373450 8.0500 NaN 0 1 \n", "\n", " Sex_male Embarked_Q Embarked_S \n", "0 1 0 1 \n", "1 0 0 0 \n", "2 0 0 1 \n", "3 0 0 1 \n", "4 1 0 1 " ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# create dummy variables for passenger class, sex, and embarked\n", "train2 = pd.get_dummies(train, columns = [\"Pclass\",\"Sex\",\"Embarked\"], drop_first = True)\n", "train2.head()" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "# remove the remaining qualitative columns\n", "train2.drop(\"Cabin\",axis = 1,inplace = True)\n", "train2.drop(\"Name\",axis = 1,inplace = True)\n", "train2.drop(\"Ticket\",axis = 1,inplace = True)\n", "\n", "# we should also drop PassengerId, although we did not last lab\n", "train2.drop(\"PassengerId\",axis = 1, inplace = True)" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# split the original training data into training and sets sets\n", "X_train,X_test,y_train, y_test =train_test_split(train2.drop(\"Survived\",axis=1),train2[\"Survived\"],test_size = 0.2)" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.72625698324022347" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# create a 3-nearest neighbor classifirer\n", "knn = KNeighborsClassifier(n_neighbors=3)\n", "# fit the classifier to the training data\n", "knn.fit(X_train, y_train)\n", "# test and score the classifier on our test data (part of the original training data)\n", "# notice this line corrects a mistake in Lab 25\n", "knn.score(X_test, y_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we are going to try running our classifier on the Kaggle test data and use all of our training data to fit the classifier.\n", "\n", "First, load the test data from Kaggle into the dataframe `test`." ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
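{ "cell_type": "markdown", "metadata": {}, "source": [ "Optionally, before moving on to the Kaggle test data, we can compare a few values of k on the held-out split created above instead of always using `n_neighbors=3`. A small sketch is below; the best k will vary from run to run because `train_test_split` chooses the split randomly." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# optional sketch: compare a few choices of k on the held-out split from above\n", "for k in [1, 3, 5, 7, 9, 11]:\n", "    knn_k = KNeighborsClassifier(n_neighbors=k)\n", "    knn_k.fit(X_train, y_train)\n", "    print(k, knn_k.score(X_test, y_test))" ] },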
" ], "text/plain": [ " PassengerId Pclass Name Sex \\\n", "0 892 3 Kelly, Mr. James male \n", "1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n", "2 894 2 Myles, Mr. Thomas Francis male \n", "3 895 3 Wirz, Mr. Albert male \n", "4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n", "\n", " Age SibSp Parch Ticket Fare Cabin Embarked \n", "0 34.5 0 0 330911 7.8292 NaN Q \n", "1 47.0 1 0 363272 7.0000 NaN S \n", "2 62.0 0 0 240276 9.6875 NaN Q \n", "3 27.0 0 0 315154 8.6625 NaN S \n", "4 22.0 1 1 3101298 12.2875 NaN S " ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test = pd.read_csv(\"../Data/test.csv\")\n", "test.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We have to process the test data in the same way as the training data, namely filling in the missing age and embarked data, creating the dummy variables for Pclass, Sex, and Embarked, and dropping the Cabin, Name, and Ticket columns. Do this below, adding as many extra cells as you need." ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
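{ "cell_type": "markdown", "metadata": {}, "source": [ "One optional judgment call when processing the test data: the fill value for `Age` could come from the training data rather than the test data, so that the test set is cleaned using only statistics learned from training. The sketch below works on an illustrative copy called `test_alt`, so it does not change the solution that follows." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# optional alternative (a sketch, not the approach used below): fill missing test-set\n", "# ages with the median Age of the *training* data; test_alt is an illustrative copy\n", "test_alt = test.copy()\n", "test_alt[\"Age\"] = test_alt[\"Age\"].fillna(train[\"Age\"].median())" ] },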
" ], "text/plain": [ " PassengerId Age SibSp Parch Fare Pclass_2 Pclass_3 Sex_male \\\n", "0 892 34.5 0 0 7.8292 0 1 1 \n", "1 893 47.0 1 0 7.0000 0 1 0 \n", "2 894 62.0 0 0 9.6875 1 0 1 \n", "3 895 27.0 0 0 8.6625 0 1 1 \n", "4 896 22.0 1 1 12.2875 0 1 0 \n", "\n", " Embarked_Q Embarked_S \n", "0 1 0 \n", "1 0 1 \n", "2 1 0 \n", "3 0 1 \n", "4 0 1 " ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test[\"Age\"] = test[\"Age\"].fillna(test[\"Age\"].median())\n", "test[\"Embarked\"] = test[\"Embarked\"].fillna(\"S\")\n", "\n", "test2 = pd.get_dummies(test, columns = [\"Pclass\",\"Sex\",\"Embarked\"], drop_first = True)\n", "\n", "test2.drop(\"Cabin\",axis = 1,inplace = True)\n", "test2.drop(\"Name\",axis = 1,inplace = True)\n", "test2.drop(\"Ticket\",axis = 1,inplace = True)\n", "\n", "test2.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Store the column PassengerId in a variable. We need this information for submitting our predictions to Kaggle, but don't want to use it in making the pedictions." ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 892\n", "1 893\n", "2 894\n", "3 895\n", "4 896\n", "5 897\n", "6 898\n", "7 899\n", "8 900\n", "9 901\n", "10 902\n", "11 903\n", "12 904\n", "13 905\n", "14 906\n", "15 907\n", "16 908\n", "17 909\n", "18 910\n", "19 911\n", "20 912\n", "21 913\n", "22 914\n", "23 915\n", "24 916\n", "25 917\n", "26 918\n", "27 919\n", "28 920\n", "29 921\n", " ... \n", "388 1280\n", "389 1281\n", "390 1282\n", "391 1283\n", "392 1284\n", "393 1285\n", "394 1286\n", "395 1287\n", "396 1288\n", "397 1289\n", "398 1290\n", "399 1291\n", "400 1292\n", "401 1293\n", "402 1294\n", "403 1295\n", "404 1296\n", "405 1297\n", "406 1298\n", "407 1299\n", "408 1300\n", "409 1301\n", "410 1302\n", "411 1303\n", "412 1304\n", "413 1305\n", "414 1306\n", "415 1307\n", "416 1308\n", "417 1309\n", "Name: PassengerId, Length: 418, dtype: int64" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "passengerId = test2[\"PassengerId\"]\n", "passengerId" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Drop the `PassengerId` column from the test data." ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "test2.drop(\"PassengerId\",axis = 1, inplace = True)" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
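{ "cell_type": "markdown", "metadata": {}, "source": [ "Because `get_dummies` only creates columns for categories that actually appear in a dataframe, and the classifier matches features by position, an optional check is to confirm that `test2` now has exactly the same columns, in the same order, as the training inputs." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# optional check: the test inputs should line up column-for-column with the training inputs\n", "train_cols = train2.drop(\"Survived\", axis=1).columns\n", "print(list(train_cols) == list(test2.columns))\n", "\n", "# if they ever disagreed, test2 could be realigned like this:\n", "# test2 = test2.reindex(columns=train_cols, fill_value=0)" ] },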
" ], "text/plain": [ " Age SibSp Parch Fare Pclass_2 Pclass_3 Sex_male Embarked_Q \\\n", "0 34.5 0 0 7.8292 0 1 1 1 \n", "1 47.0 1 0 7.0000 0 1 0 0 \n", "2 62.0 0 0 9.6875 1 0 1 1 \n", "3 27.0 0 0 8.6625 0 1 1 0 \n", "4 22.0 1 1 12.2875 0 1 0 0 \n", "\n", " Embarked_S \n", "0 0 \n", "1 1 \n", "2 0 \n", "3 1 \n", "4 1 " ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test2.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next we split up our training data into the answer (the `Survived` column) and the input data (all other columns).\n", "\n", "First, store the `Survived` column in the variable `y_train_kaggle`." ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 0\n", "1 1\n", "2 1\n", "3 1\n", "4 0\n", "Name: Survived, dtype: int64" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_train_kaggle = train2[\"Survived\"]\n", "y_train_kaggle.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next, drop the `Survived` column from the training data, and store the new data frame in the variable `X_train_kaggle`." ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
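{ "cell_type": "markdown", "metadata": {}, "source": [ "k-nearest neighbors is distance based, so columns with large numeric ranges (`Fare`, `Age`) dominate the 0/1 dummy columns when distances are computed. As an optional experiment, the sketch below standardizes the features first; it assumes `X_train_kaggle` (built in the next cell) and `y_train_kaggle` from above." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# optional sketch: put all features on a comparable scale before fitting kNN\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "scaler = StandardScaler()\n", "X_train_scaled = scaler.fit_transform(X_train_kaggle)  # fit the scaler on the training data only\n", "\n", "knn_scaled = KNeighborsClassifier(n_neighbors=3)\n", "knn_scaled.fit(X_train_scaled, y_train_kaggle)\n", "\n", "# the test data would then go through the same scaler before predicting, e.g.\n", "# knn_scaled.predict(scaler.transform(test2)), once test2 has no missing values" ] },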
" ], "text/plain": [ " Age SibSp Parch Fare Pclass_2 Pclass_3 Sex_male Embarked_Q \\\n", "0 22.0 1 0 7.2500 0 1 1 0 \n", "1 38.0 1 0 71.2833 0 0 0 0 \n", "2 26.0 0 0 7.9250 0 1 0 0 \n", "3 35.0 1 0 53.1000 0 0 0 0 \n", "4 35.0 0 0 8.0500 0 1 1 0 \n", "\n", " Embarked_S \n", "0 1 \n", "1 0 \n", "2 1 \n", "3 1 \n", "4 1 " ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train_kaggle = train2.drop(\"Survived\",axis = 1)\n", "X_train_kaggle.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create a new k-nearest neighbors object and fit it on the entire training data (`X_train_kaggle`)." ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", " metric_params=None, n_jobs=1, n_neighbors=3, p=2,\n", " weights='uniform')" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "knn_kaggle = KNeighborsClassifier(n_neighbors=3)\n", "knn_kaggle.fit(X_train_kaggle,y_train_kaggle)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now use our fitted classifier to make predictions on the test data (`test2`), and store it in the variable `y_pred`." ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "Input contains NaN, infinity or a value too large for dtype('float64').", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0my_pred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mknn_kaggle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/anaconda/lib/python3.5/site-packages/sklearn/neighbors/classification.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 141\u001b[0m \u001b[0mClass\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0meach\u001b[0m \u001b[0mdata\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 142\u001b[0m \"\"\"\n\u001b[0;32m--> 143\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'csr'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 144\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 145\u001b[0m \u001b[0mneigh_dist\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mneigh_ind\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkneighbors\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)\u001b[0m\n\u001b[1;32m 405\u001b[0m % (array.ndim, estimator_name))\n\u001b[1;32m 406\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0mforce_all_finite\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 407\u001b[0;31m \u001b[0m_assert_all_finite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 408\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 409\u001b[0m \u001b[0mshape_repr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_shape_repr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36m_assert_all_finite\u001b[0;34m(X)\u001b[0m\n\u001b[1;32m 56\u001b[0m and not np.isfinite(X).all()):\n\u001b[1;32m 57\u001b[0m raise ValueError(\"Input contains NaN, infinity\"\n\u001b[0;32m---> 58\u001b[0;31m \" or a value too large for %r.\" % X.dtype)\n\u001b[0m\u001b[1;32m 59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: Input contains NaN, infinity or a value too large for dtype('float64')." ] } ], "source": [ "y_pred = knn_kaggle.predict(test2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The error message says something about NaN. Could there be missing data (NaN) is a different column in the test data? Use describe to see if this is the case." ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
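{ "cell_type": "markdown", "metadata": {}, "source": [ "`describe` reveals the problem because its `count` row ignores NaN: any column whose count is below 418 has missing values. An optional, more direct check is to count the missing entries in each column of `test2`." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# optional, more direct check: count the missing values in each column of the test inputs\n", "test2.isnull().sum()" ] },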
" ], "text/plain": [ " Age SibSp Parch Fare Pclass_2 Pclass_3 \\\n", "count 418.000000 418.000000 418.000000 417.000000 418.000000 418.000000 \n", "mean 29.599282 0.447368 0.392344 35.627188 0.222488 0.521531 \n", "std 12.703770 0.896760 0.981429 55.907576 0.416416 0.500135 \n", "min 0.170000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 23.000000 0.000000 0.000000 7.895800 0.000000 0.000000 \n", "50% 27.000000 0.000000 0.000000 14.454200 0.000000 1.000000 \n", "75% 35.750000 1.000000 0.000000 31.500000 0.000000 1.000000 \n", "max 76.000000 8.000000 9.000000 512.329200 1.000000 1.000000 \n", "\n", " Sex_male Embarked_Q Embarked_S \n", "count 418.000000 418.000000 418.000000 \n", "mean 0.636364 0.110048 0.645933 \n", "std 0.481622 0.313324 0.478803 \n", "min 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.000000 0.000000 \n", "50% 1.000000 0.000000 1.000000 \n", "75% 1.000000 0.000000 1.000000 \n", "max 1.000000 1.000000 1.000000 " ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test2.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The fare column is missing one value. Fill it in with the median fare." ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "test2[\"Fare\"] = test2[\"Fare\"].fillna(test2[\"Fare\"].median())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now try making the prediction again." ] }, { "cell_type": "code", "execution_count": 76, "metadata": { "collapsed": true }, "outputs": [], "source": [ "y_pred = knn_kaggle.predict(test2)" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1,\n", " 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,\n", " 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,\n", " 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,\n", " 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1,\n", " 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,\n", " 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,\n", " 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,\n", " 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,\n", " 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,\n", " 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,\n", " 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1,\n", " 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,\n", " 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,\n", " 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0,\n", " 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0,\n", " 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,\n", " 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1,\n", " 1, 0, 1, 1])" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_pred" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Finally, we want to write our prediction to a file, along with the passenger ID. 
" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(data = {\"PassengerId\":passengerId, \"Survived\":y_pred})" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [], "source": [ "df.to_csv(\"test1.csv\",index = 0)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.6" } }, "nbformat": 4, "nbformat_minor": 2 }